In [1]:
# Computations
import numpy as np
import pandas as pd
import numba

# Tools
import os
import datetime
import itertools
import pickle
from scipy import stats

# sklearn
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor, RandomForestRegressor,\
                             BaggingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')

# Visualisation libraries

## progressbar
import progressbar

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px

## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

House Prices: Advanced Regression Techniques

In this article, we use a dataset from Kaggle.com.

Competition Description

Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

Dataset

File descriptions

  • train.csv - the training set
  • test.csv - the test set
  • data_description.txt - full description of each column, originally prepared by Dean De Cock but lightly edited to match the column names used here
  • sample_submission.csv - a benchmark submission from a linear regression on year and month of sale, lot square footage, and number of bedrooms

Data fields

Here's a brief version of what you'll find in the data description file.

Feature Description Feature Description
SalePrice the property's sale price in dollars. HeatingQC Heating quality and condition
MSSubClass The building class CentralAir Central air conditioning
MSZoning The general zoning classification Electrical Electrical system
LotFrontage Linear feet of street connected to property 1stFlrSF First Floor square feet
LotArea Lot size in square feet 2ndFlrSF Second floor square feet
Street Type of road access LowQualFinSF Low quality finished square feet (all floors)
Alley Type of alley access GrLivArea Above grade (ground) living area square feet
LotShape General shape of property BsmtFullBath Basement full bathrooms
LandContour Flatness of the property BsmtHalfBath Basement half bathrooms
Utilities Type of utilities available FullBath Full bathrooms above grade
LotConfig Lot configuration HalfBath Half baths above grade
LandSlope Slope of property Bedroom Number of bedrooms above basement level
Neighborhood Physical locations within Ames city limits Kitchen Number of kitchens
Condition1 Proximity to main road or railroad KitchenQual Kitchen quality
Condition2 Proximity to main road or railroad (if a second is present) TotRmsAbvGrd Total rooms above grade (does not include bathrooms)
BldgType Type of dwelling Functional Home functionality rating
HouseStyle Style of dwelling Fireplaces Number of fireplaces
OverallQual Overall material and finish quality FireplaceQu Fireplace quality
OverallCond Overall condition rating GarageType Garage location
YearBuilt Original construction date GarageYrBlt Year garage was built
YearRemodAdd Remodel date GarageFinish Interior finish of the garage
RoofStyle Type of roof GarageCars Size of garage in car capacity
RoofMatl Roof material GarageArea Size of garage in square feet
Exterior1st Exterior covering on house GarageQual Garage quality
Exterior2nd Exterior covering on house (if more than one material) GarageCond Garage condition
MasVnrType Masonry veneer type PavedDrive Paved driveway
MasVnrArea Masonry veneer area in square feet WoodDeckSF Wood deck area in square feet
ExterQual Exterior material quality OpenPorchSF Open porch area in square feet
ExterCond Present condition of the material on the exterior EnclosedPorch Enclosed porch area in square feet
Foundation Type of foundation 3SsnPorch Three season porch area in square feet
BsmtQual Height of the basement ScreenPorch Screen porch area in square feet
BsmtCond General condition of the basement PoolArea Pool area in square feet
BsmtExposure Walkout or garden level basement walls PoolQC Pool quality
BsmtFinType1 Quality of basement finished area Fence Fence quality
BsmtFinSF1 Type 1 finished square feet MiscFeature Miscellaneous feature not covered in other categories
BsmtFinType2 Quality of second finished area (if present) MiscVal Value of miscellaneous feature
BsmtFinSF2 Type 2 finished square feet MoSold Month Sold
BsmtUnfSF Unfinished square feet of basement area YrSold Year Sold
TotalBsmtSF Total square feet of basement area SaleType Type of sale
Heating Type of heating SaleCondition Condition of sale
In [2]:
PATH = 'house-prices-advanced-regression-techniques'
Data = pd.read_csv(PATH+'/House_Prices.csv', keep_default_na=False)

Target = 'SalePrice'
Train = Data.loc[Data.Set == 'Train'].drop(columns = ['Set'])
Train['SalePrice'] = Train['SalePrice'].astype(float)
Test = Data.loc[Data.Set == 'Test'].drop(columns = ['Set', Target])

def Search_List(Key, List): return [s for s in List if Key in s]

def Data_Plot(Inp, W = False):
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info ['Size'] = Inp.shape[0]
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
    data_info.index.name = 'Features'
    data_info = data_info.reset_index(drop = False)
    #
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type', text = 'Data Type',
                color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                         bordercolor="DarkGray", borderwidth=1))
    fig.update_traces(texttemplate= 6*' ' + '%{label}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    if W:
        fig.update_layout(width = W)
    fig.show()
    return data_info
    
    
data_info = Data_Plot(Data).sort_values('Features', ascending=True)

Distribution of Observations

In [3]:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
ax = ax.ravel()
# Top
_ = sns.distplot(Train['SalePrice'], color= 'Indigo' , norm_hist=True, ax = ax[0])
_ = ax[0].set_ylim([0, 1e-5])
_ = ax[0].set_xlabel('Sale Price')
_ = ax[0].set_title('Distribution of Observations')
_ = stats.probplot(Train['SalePrice'], plot= ax[1])
_ = ax[1].set_ylim([0, 8e5])
_ = ax[1].set_title('Probability Plot: Sale Price')
# Bottom
_ = sns.distplot(np.log(Train['SalePrice']), color= 'DarkGreen' , norm_hist=True, ax = ax[2])
_ = ax[2].set_ylim([0, 1.2])
_ = ax[2].set_xlabel('log(Sale Price)')
_ = ax[2].set_title('Distribution of Observations')
_ = stats.probplot(np.log(Train['SalePrice']), plot= ax[3])
_ = ax[3].set_ylim([10, 14])
_ = ax[3].set_title('Probability Plot: log(Sale Price)')

Therefore,

In [4]:
Train['SalePrice'] = np.log(Train['SalePrice'])

Data Correlations

Let's take a look at the variance of the features.

In [5]:
Fig, ax = plt.subplots(figsize=(17,16))
Temp = Train.drop(columns = [Target, 'ID']).var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T

_ = sns.heatmap(Temp, ax=ax, annot=True, square=True,  cmap =sns.color_palette("OrRd", 20),
                  linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0], annot_kws={"size": 8},
                  cbar_kws={'label': 'Feature Variance', "aspect":40, "shrink": .4, "orientation": "horizontal"})
_ = ax.set_yticklabels('')

High variance for some features can have a negative impact on our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance. For this reason, we take a unique approach in preparing the data for end-to-end modeling. In doing so, we can separate numeric and categorical features and treat each group separately.

Modeling

In [6]:
X = Train.drop(columns = [Target, 'ID'])
y = Train[Target]

Temp = X.dtypes.astype(str).to_frame(name='DataType').sort_values(by = 'DataType')

def Header(Text, L = 100, C1 = Back.BLUE, C2 = Fore.BLUE):
    print(C1 + Fore.WHITE + Style.NORMAL + Text + Style.RESET_ALL + ' ' + C2 +
          Style.NORMAL +  (L- len(Text) - 1)*'=' + Style.RESET_ALL)
def Line(L=100, C = Fore.BLUE): print(C + Style.NORMAL + L*'=' + Style.RESET_ALL)
    
Numeric_Columns = Temp.loc[Temp['DataType'] != 'object'].index.tolist()
Categorical_Columns = Temp.loc[Temp['DataType'] == 'object'].index.tolist()
del Temp

Header('Numeric Columns')
print(', '.join(Numeric_Columns))
Line()
Header('Categorical Columns')
print(', '.join(Categorical_Columns))
Line()
Numeric Columns ====================================================================================
BsmtFinSF1, BsmtUnfSF, GarageArea, BsmtHalfBath, BsmtFullBath, LotFrontage, BsmtFinSF2, MasVnrArea, TotalBsmtSF, MSSubClass, KitchenAbvGr, YearRemodAdd, LowQualFinSF, GrLivArea, GarageYrBlt, GarageCars, LotArea, 1stFlrSF, MoSold, OpenPorchSF, OverallCond, OverallQual, PoolArea, ScreenPorch, TotRmsAbvGrd, WoodDeckSF, YearBuilt, MiscVal, FullBath, HalfBath, YrSold, Fireplaces, 2ndFlrSF, 3SsnPorch, BedroomAbvGr, EnclosedPorch
====================================================================================================
Categorical Columns ================================================================================
CentralAir, BsmtQual, Neighborhood, BsmtFinType2, BsmtFinType1, BsmtExposure, PavedDrive, BsmtCond, RoofMatl, MiscFeature, RoofStyle, SaleCondition, SaleType, BldgType, Street, Alley, Utilities, PoolQC, MasVnrType, Functional, MSZoning, FireplaceQu, GarageCond, GarageFinish, GarageQual, GarageType, Fence, Exterior2nd, Exterior1st, Heating, Foundation, HeatingQC, ExterQual, KitchenQual, LandContour, LandSlope, ExterCond, LotConfig, LotShape, Electrical, Condition2, HouseStyle, Condition1
====================================================================================================

Regressors

In this section, we test a number of efficient scikit-learn regressors. Then from those that have performed well, a stacked model can be formed. In particular, we use the following models:

Regressor Link
Bagging Regressorr sklearn.ensemble.BaggingRegressor
Decision Tree Regressor sklearn.tree.DecisionTreeRegressor
Gradient Boosting Regressor sklearn.ensemble.GradientBoostingRegressor
MLP Regressor sklearn.neural_network.MLPRegressor
Random Forest Regressor sklearn.ensemble.RandomForestRegressor
In [7]:
Regressors = {'Bagging Regressorr': BaggingRegressor(),
              'Decision Tree Regression': DecisionTreeRegressor(),
              'Gradient Boosted Tree': GradientBoostingRegressor(),
              'MLP Regressor': MLPRegressor(),
              'RandomForest Regressor': RandomForestRegressor()}

TestR2 = {}
Threshold = .8
flag = True
RegList = list(Regressors.keys())
N = len(RegList)

preprocessor = make_column_transformer((StandardScaler(),  tuple(Numeric_Columns)),
                                       (OneHotEncoder(handle_unknown='ignore') , tuple(Categorical_Columns)))

def ScoreTable(reg, RegName, X = X, y = y):
    score = cross_validate(reg, X, y, scoring=['r2', 'neg_mean_absolute_error','neg_mean_squared_error'],
                       return_train_score = True, n_jobs=-1, verbose=0)
    score['test_neg_mean_absolute_error'] = -score['test_neg_mean_absolute_error']
    score['train_neg_mean_absolute_error'] = -score['train_neg_mean_absolute_error']

    Scores = pd.DataFrame(score).mean(axis = 0).map(lambda x: ('%.2f' % x)) + ' ± ' + \
            pd.DataFrame(score).std(axis = 0).map(lambda x: ('%.2e' % x))
    Scores = Scores.to_frame(RegName)
    Temp = [x.replace('_',' ').title().replace('Train','Train:').replace('Test','Test:').replace('Neg','')\
            for x in Scores.index]
    Scores.index = Temp
    return score, Scores

def plot_regression_results(reg, RegName, ax = False, X=X, y=y):
    y_pred = cross_val_predict(reg, X, y, n_jobs=-1, verbose=0)
    if not ax:
        fig, ax = plt.subplots(figsize=(6,6))
        
    _ = ax.scatter(y, y_pred, facecolors='SkyBlue', edgecolors='MidnightBlue', alpha = 0.8)
    Min = np.floor(np.stack((y.values,y_pred), axis = 0).min())
    Max = np.ceil(np.stack((y.values,y_pred), axis = 0).max())
    _ = ax.set_xlim(left = Min, right = Max)
    _ = ax.set_ylim(bottom = Min, top = Max)
    del Min, Max
    _ = ax.plot([ax.get_xlim()[0], ax.get_xlim()[1]], [ax.get_ylim()[0], ax.get_ylim()[1]], '--r', linewidth=2)
    _ = ax.set_xlabel('Measured log(Sale Price)', fontsize=14)
    _ = ax.set_ylabel('Predicted log(Sale Price)', fontsize=14)
    _ = ax.set_title(RegName, fontsize = 18)
    _ = ax.set(aspect='equal')
In [8]:
# Plot
rows = int(np.ceil(N/2))
fig, axes = plt.subplots(nrows= rows, ncols=2, figsize=(14, rows*7))
axes = axes.ravel()
if len(axes)>N:
    _ = fig.delaxes(axes[-1])
        
# Progressbar
Progress_Bar = progressbar.ProgressBar(maxval = N, widgets=[progressbar.Bar('=', '|', '|'), progressbar.Percentage()])
Progress_Bar.start()

for Counter in range(N):
    #
    Progress_Bar.update(Counter)
    RegName = RegList[Counter]
    reg = make_pipeline(preprocessor, Regressors[RegName])
    _ = reg.fit(X, y)
    score, Temp = ScoreTable(reg, RegName)
    if flag:
        Scores = Temp.copy()
    else:
        Scores = pd.concat([Scores,Temp], axis = 1)
    flag = False
    #
    if score['test_r2'].mean() >= Threshold:
        TestR2[RegName] = score['test_r2'].mean()
    
    ax = axes[Counter]
    plot_regression_results(reg, RegName, ax)
Progress_Bar.finish()
del ax, axes
|=========================================================================|100%

Moreover, the following table highlights the performance of the regressors.

In [9]:
display(Scores)
Bagging Regressorr Decision Tree Regression Gradient Boosted Tree MLP Regressor RandomForest Regressor
Fit Time 0.72 ± 1.80e-03 0.13 ± 1.33e-03 0.74 ± 2.46e-02 1.21 ± 1.08e-01 6.77 ± 2.53e-02
Score Time 0.02 ± 5.49e-04 0.01 ± 7.09e-04 0.01 ± 5.52e-04 0.01 ± 1.37e-04 0.02 ± 3.61e-03
Test: R2 0.86 ± 1.50e-02 0.73 ± 4.83e-02 0.90 ± 7.61e-03 0.75 ± 1.13e-01 0.87 ± 1.09e-02
Train: R2 0.97 ± 2.13e-03 1.00 ± 0.00e+00 0.96 ± 1.09e-03 0.93 ± 4.54e-03 0.98 ± 6.61e-04
Test: Mean Absolute Error 0.10 ± 6.25e-03 0.14 ± 1.55e-02 0.09 ± 4.01e-03 0.12 ± 1.29e-02 0.10 ± 3.13e-03
Train: Mean Absolute Error 0.04 ± 1.09e-03 0.00 ± 2.50e-18 0.06 ± 8.18e-04 0.08 ± 1.72e-03 0.04 ± 8.06e-04
Test: Mean Squared Error -0.02 ± 3.90e-03 -0.04 ± 9.47e-03 -0.02 ± 2.65e-03 -0.04 ± 1.73e-02 -0.02 ± 2.41e-03
Train: Mean Squared Error -0.00 ± 3.75e-04 -0.00 ± 4.44e-33 -0.01 ± 2.39e-04 -0.01 ± 5.71e-04 -0.00 ± 1.47e-04
In [10]:
Final_Est = np.array(list(TestR2.values()))
Final_Est = np.where (Final_Est == Final_Est.max())[0][0]
Final_Est = list(TestR2.keys())[Final_Est]
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Best Performing Model:' + Style.RESET_ALL + ' ' + Final_Est)
Best Performing Model: Gradient Boosted Tree
In [11]:
model = make_pipeline(preprocessor, Regressors[Final_Est])
_ = model.fit(X, y)
_, Score = ScoreTable(model, Final_Est)
display(Score)
plot_regression_results(model, Final_Est)
Gradient Boosted Tree
Fit Time 0.70 ± 2.11e-02
Score Time 0.01 ± 1.59e-03
Test: R2 0.90 ± 9.47e-03
Train: R2 0.96 ± 1.09e-03
Test: Mean Absolute Error 0.09 ± 4.14e-03
Train: Mean Absolute Error 0.06 ± 8.18e-04
Test: Mean Squared Error -0.02 ± 2.96e-03
Train: Mean Squared Error -0.01 ± 2.39e-04

Final Predictions

In [12]:
Pred = model.predict(Test.drop(columns = 'ID'))
Pred = np.exp(Pred)
# Pred = np.round(Pred,0)
Predictions = Test.copy()
Predictions[Target] = Pred
Predictions = Predictions[['ID', 'SalePrice']]
display(Predictions.reset_index(drop = True))
Predictions.columns = ['Id', 'SalePrice']
Predictions.to_csv(PATH + '/Submission.csv', index=False)
ID SalePrice
0 1461 122783.831101
1 1462 153457.609189
2 1463 182566.886057
3 1464 185405.512113
4 1465 194409.598000
... ... ...
1454 2915 78918.169132
1455 2916 80850.131258
1456 2917 151040.573668
1457 2918 118158.328051
1458 2919 235870.889944

1459 rows × 2 columns